В курсе про регрессию пишут, что ковариационная матрица для коэффициентов регрессии нужна для рассчета стандартных ошибок оценки. Но как себе представить ковариацию коэффициентов? На самом деле, их истинных значений с нулевой вариацией мы не знаем, а знаем лишь оценки на основе данных. Т.е. сами коэффициенты являются случайными величинами, которые мы оцениваем. А посему точность их оценки может быть изучена.
library(tidyverse)
library(ggplot2)
swiss_scaled = swiss %>% mutate_all(scale)
ms = lm(Fertility ~ ., swiss_scaled)
ms %>% summary
Call:
lm(formula = Fertility ~ ., data = swiss_scaled)
Residuals:
Min 1Q Median 3Q Max
-1.22275 -0.42122 0.04029 0.32980 1.22652
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.162e-16 8.367e-02 0.000 1.00000
Agriculture -3.129e-01 1.278e-01 -2.448 0.01873 *
Examination -1.648e-01 1.621e-01 -1.016 0.31546
Education -6.704e-01 1.409e-01 -4.758 2.43e-05 ***
Catholic 3.476e-01 1.177e-01 2.953 0.00519 **
Infant.Mortality 2.511e-01 8.901e-02 2.822 0.00734 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.5736 on 41 degrees of freedom
Multiple R-squared: 0.7067, Adjusted R-squared: 0.671
F-statistic: 19.76 on 5 and 41 DF, p-value: 5.594e-10
vcov(ms) %>% round(3)
(Intercept) Agriculture Examination Education Catholic Infant.Mortality
(Intercept) 0.007 0.000 0.000 0.000 0.000 0.000
Agriculture 0.000 0.016 0.005 0.007 -0.003 0.003
Examination 0.000 0.005 0.026 -0.013 0.011 0.000
Education 0.000 0.007 -0.013 0.020 -0.008 0.002
Catholic 0.000 -0.003 0.011 -0.008 0.014 -0.002
Infant.Mortality 0.000 0.003 0.000 0.002 -0.002 0.008
# а еще посмотрим на зависимости между переменными
ms2 = lm(Fertility ~ (.)^2, swiss_scaled)
ms2 %>% summary
Call:
lm(formula = Fertility ~ (.)^2, data = swiss_scaled)
Residuals:
Min 1Q Median 3Q Max
-0.70157 -0.31115 -0.05445 0.25119 1.12881
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.01212 0.18137 0.067 0.94715
Agriculture -0.30657 0.16000 -1.916 0.06462 .
Examination -0.20774 0.21217 -0.979 0.33511
Education -0.68458 0.20935 -3.270 0.00264 **
Catholic 0.16811 0.20964 0.802 0.42872
Infant.Mortality 0.20197 0.11445 1.765 0.08747 .
Agriculture:Examination 0.31001 0.19980 1.552 0.13091
Agriculture:Education 0.33321 0.26624 1.252 0.22009
Agriculture:Catholic 0.19914 0.21609 0.922 0.36387
Agriculture:Infant.Mortality 0.33732 0.15785 2.137 0.04060 *
Examination:Education 0.46164 0.22319 2.068 0.04703 *
Examination:Catholic -0.04082 0.28726 -0.142 0.88791
Examination:Infant.Mortality 0.31812 0.24009 1.325 0.19485
Education:Catholic -0.22894 0.32668 -0.701 0.48865
Education:Infant.Mortality 0.07530 0.27846 0.270 0.78863
Catholic:Infant.Mortality 0.09645 0.15724 0.613 0.54409
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.5182 on 31 degrees of freedom
Multiple R-squared: 0.819, Adjusted R-squared: 0.7314
F-statistic: 9.352 on 15 and 31 DF, p-value: 1.077e-07
Можно теперь нагенерить разных оценок коэффициентов регрессии из разных подвыборок методом бутстрепа и посмотреть на зависимость оценок одних коэффициентов от других. По идее, это должно идейно напоминать матрицу ковариции.
R = 1000
numbers = 1:nrow(swiss_scaled)
regressors = swiss_scaled %>% colnames()
regressors[1] = "(Intercept)"
coef_df = data.frame(R = numeric(),
Coeff = character(),
Val = numeric(), stringsAsFactors = F)
count = 1
for (i in 1:R){
m_current = lm(Fertility ~ ., swiss_scaled[sample(numbers, replace = T),]) %>% coefficients()
for (regr in regressors){
coef_df[count,"R"] = i
coef_df[count,"Coeff"] = regr
coef_df[count,"Val"] = m_current[regr]
count = count + 1
}
}
vcov(ms) %>% round(3)
(Intercept) Agriculture Examination Education Catholic Infant.Mortality
(Intercept) 0.007 0.000 0.000 0.000 0.000 0.000
Agriculture 0.000 0.016 0.005 0.007 -0.003 0.003
Examination 0.000 0.005 0.026 -0.013 0.011 0.000
Education 0.000 0.007 -0.013 0.020 -0.008 0.002
Catholic 0.000 -0.003 0.011 -0.008 0.014 -0.002
Infant.Mortality 0.000 0.003 0.000 0.002 -0.002 0.008
library(GGally)
coef_df_w = coef_df %>% spread(Coeff, Val) %>% select(-R)
cov(coef_df_w) %>% round(3)
(Intercept) Agriculture Catholic Education Examination Infant.Mortality
(Intercept) 0.007 -0.001 0.001 0.004 -0.002 0.001
Agriculture -0.001 0.013 -0.002 0.008 0.004 0.000
Catholic 0.001 -0.002 0.011 -0.006 0.009 -0.002
Education 0.004 0.008 -0.006 0.030 -0.016 0.000
Examination -0.002 0.004 0.009 -0.016 0.026 0.001
Infant.Mortality 0.001 0.000 -0.002 0.000 0.001 0.010
vcov(ms) %>% round(3)
(Intercept) Agriculture Examination Education Catholic Infant.Mortality
(Intercept) 0.007 0.000 0.000 0.000 0.000 0.000
Agriculture 0.000 0.016 0.005 0.007 -0.003 0.003
Examination 0.000 0.005 0.026 -0.013 0.011 0.000
Education 0.000 0.007 -0.013 0.020 -0.008 0.002
Catholic 0.000 -0.003 0.011 -0.008 0.014 -0.002
Infant.Mortality 0.000 0.003 0.000 0.002 -0.002 0.008
ggpairs(coef_df_w)